Exploratory Data Analysis of COVID19 dataset
Analyzing present condition of COVID19
COVID19 Outbreak - Data Visualization
COVID19 Outbreak - Prediction using Machine Learning
#Install Packages
#import pip
#package_names=['sklearn', 'fbprophet'] #packages to install
#pip.main(['install'] + package_names + ['--upgrade'])
#importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
#Reading Data using pandas; 'Last Update' is parsed to datetime on load
# NOTE(review): hard-coded local Windows path — adjust for your environment
data=pd.read_csv('C:\\Jupyter\\covid_19_data.csv' ,parse_dates=['Last Update'])
#Renaming columns to the shorter names used throughout the rest of the notebook
data.rename(columns={'ObservationDate':'Date', 'Country/Region':'Country'}, inplace=True)
#calling the head of the data frame (quick visual sanity check)
data.head()
#Check for number of rows & columns
print("No. of Rows & Columns:" + str(data.shape))
#checking if null is present in dataset
print(data.isnull().sum())
#Grouping of data based on Country and Date, summing the numeric case counts.
# BUG FIX: the original also selected 'Date' and 'Country' inside the
# aggregation; summing the grouping keys is meaningless and raises a TypeError
# on modern pandas (non-numeric sum), so only the count columns are kept.
df = data.groupby(["Date", "Country"])[['Confirmed', 'Deaths', 'Recovered']].sum().reset_index()
#Sort the data based on Confirmed Cases
sort_By_Confirmed_cases=df.sort_values('Confirmed',ascending=False)
#Keep one row per country: after the descending sort this retains each
#country's largest (i.e. most recent cumulative) Confirmed figure.
sort_By_Confirmed_cases=sort_By_Confirmed_cases.drop_duplicates('Country')
sort_By_Confirmed_cases.head()
#making a different set of confirmed, Death & Recovered cases for the world
# (each country appears once in sort_By_Confirmed_cases, so these sums are world totals)
Confirmed_cases_for_world=sort_By_Confirmed_cases['Confirmed'].sum()
Deaths_cases_for_world=sort_By_Confirmed_cases['Deaths'].sum()
Recovered_cases_for_world=sort_By_Confirmed_cases['Recovered'].sum()
#Active cases = confirmed minus deaths minus recovered
Active_cases=Confirmed_cases_for_world-Deaths_cases_for_world-Recovered_cases_for_world
print("Confirmed cases around the world: " + str(Confirmed_cases_for_world))
print("Deaths around the world: " + str(Deaths_cases_for_world))
print("Recovered cases around the world: " + str(Recovered_cases_for_world))
print("Active cases around the world: " + str(Active_cases))
#Finding the death percentage for the world
Deaths_rate=(Deaths_cases_for_world*100)/Confirmed_cases_for_world
#finding the recovered rate percentage for the world
Recovered_rate=(Recovered_cases_for_world*100)/Confirmed_cases_for_world
#Cases only for China province
China=sort_By_Confirmed_cases[sort_By_Confirmed_cases['Country']=='Mainland China']
# BUG FIX: the original used int(Series.values), which is deprecated in NumPy
# for size-1 arrays and raises outright if the filter matches != 1 row;
# .iloc[0] reads the single matched value directly.
Recovered_rate_for_china=(China['Recovered'].iloc[0]*100)/China['Confirmed'].iloc[0]
#mapping all the data in a table
Set1={'Total Number of Confirmed cases in the World':Confirmed_cases_for_world,'Total Number of Death cases in the World':Deaths_cases_for_world,'Total Number of Recovered cases in the World':Recovered_cases_for_world,'Total Number of Active Cases':Active_cases,
'Rate of the Recovered Cases in the world':Recovered_rate,'Rate of the Death Cases in the world %':Deaths_rate,'Rate of the Recovered Cases in the China %':Recovered_rate_for_china}
Set1=pd.DataFrame.from_dict(Set1, orient='index' ,columns=['Total'])
print("Data is till 04/04/2020")
Set1.style.background_gradient(cmap='Reds')
#Finding the Recovered Rate per country
Recovered_rate=(sort_By_Confirmed_cases['Recovered']*100)/sort_By_Confirmed_cases['Confirmed']
#Finding the Death Rate per country
Deaths_rate=(sort_By_Confirmed_cases['Deaths']*100)/sort_By_Confirmed_cases['Confirmed']
#Finding each country's share of the world's confirmed cases
cases_rate=(sort_By_Confirmed_cases.Confirmed*100)/Confirmed_cases_for_world
sort_By_Confirmed_cases['Active Cases']=sort_By_Confirmed_cases['Confirmed']-sort_By_Confirmed_cases['Deaths']-sort_By_Confirmed_cases['Recovered']
sort_By_Confirmed_cases['% of Recovered cases']=pd.DataFrame(Recovered_rate)
sort_By_Confirmed_cases['% of Death cases']=pd.DataFrame(Deaths_rate)
sort_By_Confirmed_cases['% of Total cases']=pd.DataFrame(cases_rate)
print("Sorted By Confirmed Cases")
# BUG FIX: the Blues subset referenced 'Total Cases Rate %', a column that is
# never created (the column added above is named '% of Total cases'), which
# makes background_gradient raise a KeyError.
sort_By_Confirmed_cases.style.background_gradient(cmap="Blues", subset=['Confirmed', 'Active Cases', '% of Total cases'])\
.background_gradient(cmap="Greens", subset=['Recovered','% of Recovered cases'])\
.background_gradient(cmap="Reds", subset=['Deaths','% of Death cases'])
sort_By_Confirmed_cases.head()
#finding the total number of confirmed cases in China till date
#(filters the deduplicated per-country table down to Mainland China)
sort_By_Confirmed_cases[sort_By_Confirmed_cases['Country']=='Mainland China']
#Sort the data based on Confirmed Cases and group by country
df=sort_By_Confirmed_cases.groupby(['Country']).sum().sort_values(by ='Confirmed',ascending=False)
df.reset_index(level=0, inplace=True)
df.head()
#Countries with at least one confirmed case, with only the two relevant columns
Country_wise_Confirmed = sort_By_Confirmed_cases[sort_By_Confirmed_cases['Confirmed']>0][['Country', 'Confirmed']]
Country_wise_Confirmed.sort_values('Confirmed', ascending=False).reset_index(drop=True).style.background_gradient(cmap='Greens')
#Bar plot (plotly express) of confirmed cases for the top 20 countries
fig = px.bar(Country_wise_Confirmed.sort_values('Confirmed', ascending=False).head(20),
x="Country", y="Confirmed", color='Confirmed',
height=800, width=1000,
title='Number of Confirmed Cases in World in top 20 countries')
#fig.update_traces(text=Country_wise_Confirmed['Confirmed'], textposition='outside')
fig.update_layout(uniformtext_minsize=10, uniformtext_mode='hide')
fig.show()
#Countries with at least one recovered case, sorted for display
Country_wise_Recovered = sort_By_Confirmed_cases[sort_By_Confirmed_cases['Recovered']>0][['Country', 'Recovered']]
#Country_wise_Recovered['Recovered / 100 Cases'] = round((sort_By_Confirmed_cases['Recovered']/sort_By_Confirmed_cases['Recovered'])*100, 2)
Country_wise_Recovered.sort_values('Recovered', ascending=False).reset_index(drop=True).style.background_gradient(cmap='Greens')
#Bar plot of recovered cases for the top 20 countries
fig = px.bar(Country_wise_Recovered.sort_values('Recovered', ascending=False).head(20),
x="Country", y="Recovered", color='Recovered',
height=800, width=1000,
title='Number of Recovered in World in top 20 countries')
#fig.update_traces(text=Country_wise_Recovered['Recovered'], textposition='outside')
fig.update_layout(uniformtext_minsize=10, uniformtext_mode='hide')
fig.show()
#Countries with at least one death, sorted for display
Country_wise_deaths = sort_By_Confirmed_cases[sort_By_Confirmed_cases['Deaths']>0][['Country', 'Deaths']]
Country_wise_deaths.sort_values('Deaths', ascending=False).reset_index(drop=True).style.background_gradient(cmap='Reds')
#Bar plot of deaths for the top 20 countries
fig = px.bar(Country_wise_deaths.sort_values('Deaths', ascending=False).head(20),
x="Country", y="Deaths", color='Deaths',
height=600, width=1000,
title='Number of Deaths in World in top 20 countries')
#fig.update_traces(text=Country_wise_deaths['Deaths'], textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig.show()
#Scatter plot of deaths vs confirmed cases for the 20 countries with most deaths
#(log-log axes so both small and large countries remain visible)
fig = px.scatter(sort_By_Confirmed_cases.sort_values('Deaths', ascending=False).iloc[:20, :],
x='Confirmed', y='Deaths', color='Country', size='Confirmed', height=800,
text='Country', log_x=True, log_y=True, title='Deaths cases vs Confirmed cases ')
fig.update_traces(textposition='top center')
fig.update_layout(xaxis_rangeslider_visible=True)
fig.show()
#Group data by date to find the worldwide totals per day.
# BUG FIX: the old df.groupby(...)['a','b'] tuple-style column selection was
# deprecated and removed in pandas 2.0 — a list (double brackets) is required.
Daily_cases = data.groupby(["Date"])[['Confirmed', 'Deaths', 'Recovered']].sum().reset_index()
#Sort by date, most recent first
sorted_By_Confirmed_cases_per_day=Daily_cases.sort_values('Date',ascending=False)
print(sorted_By_Confirmed_cases_per_day)
###Ratio of Death cases with respect to recovered cases function
def Ratio_of_Death_Recovered(data_frame):
    """Plot the death ratio and recovered ratio (each relative to confirmed
    cases) of *data_frame* on twin y-axes of one wide figure.

    data_frame must contain 'Deaths', 'Recovered' and 'Confirmed' columns.
    """
    figure, axes1 = plt.subplots(1, 1, figsize=(20, 7))
    axes1.plot(data_frame['Deaths'] / data_frame['Confirmed'], 'r', label='Death Ratio')
    axes1.legend(loc='upper left')
    # BUG FIX: the tick labels used to come from the global
    # sorted_By_Confirmed_cases_per_day instead of the function argument,
    # which broke the function for any other input frame.
    axes1.set_xticklabels(data_frame.index, rotation=75)
    axes1.set_ylabel('Death Ratio', fontsize=15, color='r')
    # Second y-axis sharing the same x-axis for the recovered ratio
    axes2 = axes1.twinx()
    axes2.plot(data_frame['Recovered'] / data_frame['Confirmed'], 'g', label='Recovered Ratio')
    axes2.legend(loc='upper center')
    axes2.set_ylabel('Recovered Ratio', fontsize=15, color='g')
###Ratio of Death cases with respect to recovered cases function is called
#(plots the ratios for the daily worldwide totals computed above)
Ratio_of_Death_Recovered(sorted_By_Confirmed_cases_per_day)
#assigning variables to the plot axes (day index vs worldwide daily totals)
x=Daily_cases.index
y=Daily_cases.Confirmed
y1=Daily_cases.Deaths
y2=Daily_cases.Recovered
#calling seaborn library and set the style to whitegrid
sns.set(style="whitegrid")
# Initialize the matplotlib figure
f, ax = plt.subplots(figsize=(12,10 ))
#plot confirmed / death / recovered counts per day as scatter points
plt.scatter(x,y,color='Green' , label='Confirmed Cases')
plt.scatter(x,y1,color='red' ,label="Deaths Cases")
plt.scatter(x,y2,color='yellow',label="Recovered Cases")
plt.title("Increasing Coronavirus cases in the world per day .")
ax.legend(ncol=2, loc='upper left', frameon=True)
plt.show()
#Coronavirus cases statistics per day (styled table, most recent first)
sorted_By_Confirmed_cases_per_day.style.background_gradient(cmap='Reds')
#Bar plot of confirmed cases per day
#(head(20) = the 20 most recent dates, since the frame is sorted by Date descending)
fig = px.bar(sorted_By_Confirmed_cases_per_day.head(20),
x="Date", y="Confirmed", color='Confirmed',
height=600, width=1000,
title='Increasing Number of Confirmed cases in World on daily basis')
fig.update_traces(text=sorted_By_Confirmed_cases_per_day['Confirmed'], textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig.show()
#Bar plot of recovered cases per day (same 20 most recent dates)
fig = px.bar(sorted_By_Confirmed_cases_per_day.head(20),
x="Date", y="Recovered", color='Recovered',
height=600, width=1000,
title='Increasing Number of Recovered cases in World on daily basis')
fig.update_traces(text=sorted_By_Confirmed_cases_per_day['Recovered'], textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig.show()
#Bar plot of deaths per day (same 20 most recent dates)
fig = px.bar(sorted_By_Confirmed_cases_per_day.head(20),
x="Date", y="Deaths", color='Deaths',
height=600, width=1000,
title='Increasing Number of Deaths cases in World on daily basis')
fig.update_traces(text=sorted_By_Confirmed_cases_per_day['Deaths'], textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig.show()
#geographical map based on Coronavirus spread
# NOTE(review): range_color=[0, 40] saturates the colour scale at 40 cases —
# presumably intended to highlight any presence of the virus; confirm
fig = px.choropleth(sort_By_Confirmed_cases, locations="Country", locationmode='country names',
color="Confirmed", hover_name="Country", range_color= [0, 40], projection="natural earth",
title='Coronavirus Spread across the world')
fig.update(layout_coloraxis_showscale=False)
fig.show()
#geographical map based on Coronavirus confirmed cases spread
#(log colour scale so countries with few cases remain distinguishable)
fig = px.choropleth(sort_By_Confirmed_cases, locations="Country",
locationmode='country names', color=np.log(sort_By_Confirmed_cases["Confirmed"]),
hover_name="Country", hover_data=['Confirmed'],
color_continuous_scale="Sunsetdark",
title='Countries with having Confirmed Cases')
fig.update(layout_coloraxis_showscale=False)
fig.show()
We are going to use Prophet for prediction which is a procedure for forecasting time series data based on an additive model where non-linear trends are fit with yearly, weekly, and daily seasonality, plus holiday effects. Benefits: Accurate and fast / Fully automatic / robust to missing data and shifts in the trend / easy to use Limitations: It works best with time series that have strong seasonal effects and several seasons of historical data. Usage: It only takes 2 inputs "ds" = timeseries and "y" = object of analysis value.
Using m.predict function for Prophet package forecast future 5 days confirmed cases development
Measure prediction accuracy using the mean absolute percentage error (MAPE). MAPE is a standard accuracy measure for forecasting methods such as trend estimation, so it fits our needs; the following Python function was written to help us calculate the MAPE value of the model.
Using Prophet Cross validation to predict and again measure using MAPE
Polynomial Regression to Predict future cases
Narrowing down our analysis to Ireland COVID19 growth rate
#define https paths for datasources "new daily data is updated automatically", hence easy for us to query results in future.
#(JHU CSSE global time-series CSVs: one file each for confirmed, deaths and recovered)
confirmed_global_path = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
deaths_global_path = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'
recovered_global_path = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv'
#create python functions to be used in melting, renaming and transforming dates on the 3 available files.
#functions created to avoid work repetition while working thru the 3 different files.
def organize_data(path_url, case_type):
    """Read one JHU time-series CSV and reshape it from wide to long format.

    path_url  : path / URL / buffer accepted by pd.read_csv
    case_type : name for the value column ('Confirmed', 'Recovered' or 'Deaths')
    Returns a DataFrame with columns Province, Country, Lat, Long, Date, <case_type>.
    """
    wide = pd.read_csv(path_url)
    # One row per (location, date): every per-date column collapses into a
    # single variable/value pair.
    tidy = wide.melt(id_vars=['Province/State', 'Country/Region', 'Lat', 'Long'])
    tidy = tidy.rename(columns={
        'Province/State': 'Province',
        'Country/Region': 'Country',
        'variable': 'Date',
        'value': case_type,
    })
    tidy['Date'] = pd.to_datetime(tidy['Date'])
    return tidy
def combine_data(confirm_df, recovered_df, deaths_df):
    """Attach the Recovered and Deaths columns to the confirmed-cases frame.

    Joins are index-aligned: the three frames must list the same
    locations/dates in the same row order.
    """
    combined = confirm_df.join(recovered_df['Recovered'])
    combined = combined.join(deaths_df['Deaths'])
    return combined
#call function to get and organize data into new dfs
confirmed_df = organize_data(confirmed_global_path,"Confirmed")
recovered_df = organize_data(recovered_global_path,"Recovered")
deaths_df = organize_data(deaths_global_path,"Deaths")
#calling function to combine data from the previous created dfs
#(join is index-positional: assumes the three files list locations/dates in the same order)
covid_df = combine_data(confirmed_df,recovered_df,deaths_df)
covid_df.head(2)
#create daily wise Data frame required for timeseries predictions
df_daily_wise = covid_df.groupby("Date")[['Confirmed','Recovered', 'Deaths']].sum()
df_daily_wise.head(2)
#after grouping function is important to reset index, required for timeseries predictions
worldwide_cases = df_daily_wise.reset_index()
worldwide_cases.head(2)
#since Prophet only takes 2 inputs, we will focus prediction analysis on confirmed cases, and rename accordingly to req.
confirmed_cases = worldwide_cases[["Date","Confirmed"]]
# NOTE(review): assigning .columns on a sliced frame can trigger
# SettingWithCopyWarning; a .copy() before renaming would silence it
confirmed_cases.columns = ['ds','y'] # renaming required for inputs into the Prophet model
confirmed_cases.tail(2)
#load model from Facebook Prophet package
#The easiest way to install Prophet is through conda-forge: conda install -c conda-forge fbprophet.
from fbprophet import Prophet
#separate training and testing datasets. After some tests and due to exp growth rate, we decided to keep the prediction range of 5 days
train_df = confirmed_cases.loc[confirmed_cases.ds <= '2020-03-29']
test_df = confirmed_cases.loc[confirmed_cases.ds > '2020-03-29']
# initialize model (default settings: linear growth, automatic seasonality)
m = Prophet()
# train and fit model with training dataset using m.fit() function from prophet
m.fit(train_df)
#make future predictions. after several tests with different periods, we've noted that a 5-day window works better
#prophet function m.make_future_dataframe extends the training dates by `periods` days
future = m.make_future_dataframe(periods=5)
future.tail(5)
# utilize m.predict function for Prophet package to forecast future 5 days confirmed cases development
forecast = m.predict(future)
m.plot(forecast)
#check table with predicted values; in Prophet the prediction is named yhat, with lower and upper bounds
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail(5)
#in order to compare predicted values (yhat) against the testing data frame, merge them on the date column
comparison_df = forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail(5)
comparison_df = pd.merge(comparison_df, test_df,on="ds")
comparison_df.tail(5)
Step 10: we will be using the mean absolute percentage error (MAPE). MAPE is a measure of the prediction accuracy of a forecasting method (for example, trend estimation), so it fits our needs; the following Python function was written to help us calculate the MAPE value of the model.
this formula was built based on the article written by Ruan van der Merwe (2018) available at: https://towardsdatascience.com/implementing-facebook-prophet-efficiently-c241305405a3
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE) of *y_pred* vs *y_true*.

    Result is a NumPy scalar (callers rely on its .round method).
    Undefined when any true value is zero (division by zero).
    """
    actual, forecast = np.asarray(y_true), np.asarray(y_pred)
    relative_errors = np.abs((actual - forecast) / actual)
    return relative_errors.mean() * 100
#call function to calculate MAPE on the 5 held-out days (actual y vs predicted yhat)
mape = mean_absolute_percentage_error(comparison_df.y, comparison_df.yhat)
print('MAPE: \n', mape.round(2),"%")
Step 10: Interpreting the results of MAPE
A MAPE of 21.16% indicates that, over all the points predicted, we are off by an average of 21.16% from the actual value.
Prophet allows the user to tune the model. For example, the growth parameter used by default is linear; for testing, we changed it to "logistic", but that requires a maximum value ("cap") for "y". For our study this would mean having a domain expert establish a maximum expected number of confirmed cases, which, given the current world scenario, we felt did not make sense.
#import function
from fbprophet.diagnostics import cross_validation
# new df with cross validation function using previous model "m";
#we specify the forecast horizon "horizon" — initial and period are optional
df_cv = cross_validation(m, horizon = '5 days')
df_cv.tail()
#let's measure the MAPE after running the cross validation
mape = mean_absolute_percentage_error(df_cv.y, df_cv.yhat)
print('MAPE: \n', mape.round(2),"%")
from fbprophet.plot import add_changepoints_to_plot
# using add_changepoints_to_plot to mark the detected trend changepoints on the forecast plot
fig = m.plot(forecast)
c = add_changepoints_to_plot(fig.gca(),m,forecast)
Scikit-learn, a.k.a. sklearn, is a free machine-learning library for the Python programming language. It features different ML algorithms, which we will use in the following analysis (linear regression).
#import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
#create daily wise Data frame required for timeseries predictions with index reset
# BUG FIX: the old df.groupby(...)['a','b'] tuple-style column selection was
# deprecated and removed in pandas 2.0 — a list (double brackets) is required.
daily_cases = covid_df.groupby(["Date"])[['Confirmed', 'Deaths', 'Recovered']].sum().reset_index()
daily_cases.head(2)
#separate Train data "x confirmed in function of y daylywise developments - variables for regression model"
x_df=pd.DataFrame(daily_cases.index)   # feature: day index (0..n-1)
y_df=pd.DataFrame(daily_cases.Confirmed)   # target: cumulative confirmed cases
#training and test split from sklearn functions
# NOTE(review): train_test_split shuffles by default; for a time series a
# chronological split (shuffle=False) is usually preferred — confirm intent
x_train,x_test,y_train,y_test=train_test_split(x_df,y_df,test_size=0.1,random_state=0)
#create linear regression object and polynomial features 5 degrees
#The following code, was inspired on the work published by Abdulrhman Alothman (2020) available at Kaggle.
poly_reg=PolynomialFeatures(degree=5)
x_poly=poly_reg.fit_transform(x_train)
lin_reg2=LinearRegression()
lin_reg2.fit(x_poly,y_train)
#model graphic for polynomial regression: actual daily totals (red points)
#with the fitted degree-5 curve (blue line)
# BUG FIX: groupby(...)['a','b'] tuple-style selection was removed in pandas 2.0.
cases_per_Day = covid_df.groupby(["Date"])[['Confirmed', 'Deaths', 'Recovered']].sum().reset_index()
sorted_By_Confirmed1=cases_per_Day.sort_values('Date',ascending=False)
x=cases_per_Day.index
y=cases_per_Day.Confirmed
plt.scatter(x,y,color='red')
# BUG FIX: train_test_split shuffles, so plotting a line through the unsorted
# x_test drew a zig-zag; sort the test points by x before drawing the curve.
x_line = x_test.sort_values(by=x_test.columns[0])
plt.plot(x_line,lin_reg2.predict(poly_reg.fit_transform(x_line)),color='blue')
plt.title("Polynomial Regression Model ")
plt.show()
#now, let's test the algorithm on the held-out split
y_pred=lin_reg2.predict(poly_reg.fit_transform(x_test))
result=pd.DataFrame(y_pred)
result['Real Value']=y_test.iloc[:,:].values
result['Predicted Value']=pd.DataFrame(y_pred)
result=result[['Real Value','Predicted Value']]
result
#R2 (coefficient of determination): 1.0 is a perfect fit
print('Polynomial Regession R2 Score : ',r2_score(y_test, y_pred))
#let's also measure the MAPE of the polynomial model
mape = mean_absolute_percentage_error(result['Real Value'], result['Predicted Value'])
print('MAPE: \n', mape.round(2),"%")
#today is 04/04/2020
# Extrapolate the fitted polynomial to future day indices (75, 77, 87).
# NOTE(review): "{0} day" prints (index - number of observed days); if the
# frame already has more rows than the index, this is negative — verify
print("After {0} day will be {1} case in the world".format((75-len(cases_per_Day)),lin_reg2.predict(poly_reg.fit_transform([[75]]))))
print("After {0} day will be {1} case in the world".format((77-len(cases_per_Day)),lin_reg2.predict(poly_reg.fit_transform([[77]]))))
print("After {0} day will be {1} case in the world".format((87-len(cases_per_Day)),lin_reg2.predict(poly_reg.fit_transform([[87]]))))
#Since we want to narrow down our analysis and explore some countries' similarities, import relevant data
#Countries indicators "Population, GDP" available at: World Bank Data - https://data.worldbank.org/
# NOTE(review): hard-coded local paths — adjust for your environment
global_population_path = 'C:/Users/alexz/Desktop/Business Analytics/2nd Semester/Advanced Programming for Business Analytics/Assignments/Group Assignment/WorldDevelopmentIndicator_Population.csv'
global_gdp_path = 'C:/Users/alexz/Desktop/Business Analytics/2nd Semester/Advanced Programming for Business Analytics/Assignments/Group Assignment/WorldDevelopmentIndicator_GDP.csv'
#countries Human Development Index "HDI" available at: http://hdr.undp.org/en/indicators/137506#
global_hdi_path = 'C:/Users/alexz/Desktop/Business Analytics/2nd Semester/Advanced Programming for Business Analytics/Assignments/Group Assignment/UnitedNation_HumanDevelopmentIndex.csv'
#Since we have 3 different and big files to clean and merge "over 265 rows and 133 columns in 3 files"
#create python functions to automate reading, cleaning and join procedures "only 2017 data = most updated and completed"
def read_n_clean_data(path_url, indicator_type):
    """Load an indicator CSV and reduce it to Country + its 2017 value.

    path_url       : path / URL / buffer accepted by pd.read_csv
    indicator_type : name given to the 2017 value column (e.g. 'population')
    """
    raw = pd.read_csv(path_url)
    # First column holds the country name regardless of its original header
    raw = raw.rename(columns={raw.columns[0]: "Country"})
    trimmed = raw[['Country', '2017']]
    return trimmed.rename(columns={'2017': indicator_type})
def combine_data(population_df, gdp_df, hdi_df):
    """Merge the three country-indicator tables on 'Country'.

    Population and GDP come from the same source (World Bank) so an inner
    merge keeps all rows; HDI comes from a different source, so a left merge
    preserves the World Bank rows even when no HDI match exists.
    """
    world_bank = population_df.merge(gdp_df, on="Country")
    return world_bank.merge(hdi_df, on="Country", how="left")
#call functions to read and clean csv datasets "over 263 rows and 158 columns in 3 files"
population_df = read_n_clean_data(global_population_path,"population")
gdp_df = read_n_clean_data(global_gdp_path,"gdp")
hdi_df = read_n_clean_data(global_hdi_path,"hdi")
#call combine_data function to merge imported and cleaned data
indicator_df = combine_data(population_df,gdp_df,hdi_df)
indicator_df.tail()
#since we want to filter by continent = Asia "COVID-19 initial hub", get country vs continent table from github
country_continent_path = 'https://raw.githubusercontent.com/dbouquin/IS_608/master/NanosatDB_munging/Countries-Continents.csv'
df_continents = pd.read_csv(country_continent_path)
df_continents = df_continents[["Country","Continent"]]
df_continents.tail(2)
# final pandas merge, to join continent information. merge on left to keep complete dataset, even with missing results.
countries_df = pd.merge(indicator_df, df_continents,on="Country", how="left")
# sanity check: show Ireland's row (our reference country)
countries_df.loc[countries_df['Country'] == "Ireland"]
#Narrowing down analysis: find countries with indicators similar to Ireland
#(population 4–6 million, located in Asia/Oceania, very high HDI).
# BUG FIX: hdi was compared against the string "0.9", which only behaves
# numerically by coincidence if the column was read as text; compare against
# the number 0.9 instead.
countries_df.loc[(countries_df['population'] >= 4000000) & (countries_df['population'] <= 6000000) &
(countries_df['Continent'].isin(['Asia', 'Oceania'])) &
(countries_df['hdi'] >= 0.9)]
#now that we identified similar countries, we are interested in the Asian one, since it was hit earlier by the outbreak.
#for the next step of our analysis, we want to identify the weekly growth rate per cases and plot them for comparison
#the following function was created to reduce code length and ease of use when querying information for different countries
def calculate_growth_rate(df, country):
    """Return the week-by-week growth rate of confirmed cases for *country*.

    df      : long-format frame with 'Country', 'Date' (datetime) and
              cumulative 'Confirmed' columns
    country : country name to filter on
    Returns a frame with Weeknum, Confirmed (weekly max) and GrowthRate (%).
    """
    df = df.loc[(df["Country"]) == country]  # filter down to the requested country
    df = df.copy()  # copy to avoid SettingWithCopyWarning when adding a column
    # BUG FIX: Series.dt.week was deprecated and removed in pandas 2.0;
    # isocalendar().week is the supported replacement (cast back to int to
    # keep the original integer dtype).
    df["Weeknum"] = df["Date"].dt.isocalendar().week.astype(int)
    # Values are reported cumulatively, so each week's max is its total
    df_weekly_growth = df.groupby(["Weeknum"])['Confirmed'].max().reset_index()
    # pct_change(): percentage change between the current and prior element
    df_weekly_growth['GrowthRate'] = df_weekly_growth.Confirmed.pct_change().mul(100).round(2)
    df_weekly_growth["GrowthRate"] = df_weekly_growth["GrowthRate"].replace(np.inf, np.nan)  # infinite division -> NaN
    df_weekly_growth["GrowthRate"] = df_weekly_growth["GrowthRate"].replace(np.nan, 0)  # NaN -> 0 for plotting/formatting
    return df_weekly_growth
#call created function to get Ireland and Singapore weekly growth rates into a DF
weekly_growth_singapore = calculate_growth_rate(covid_df,"Singapore")
weekly_growth_ireland = calculate_growth_rate(covid_df,"Ireland")
weekly_growth_singapore # view weekly growth rate table for Singapore
# view weekly growth rate table for Ireland
weekly_growth_ireland
#plotting Singapore growth rate curve
plt.plot(weekly_growth_singapore["Weeknum"], weekly_growth_singapore["GrowthRate"])
plt.title("Singapore Weekly Growth Rate Curve")
plt.show()
#plotting Ireland growth rate curve. Since first cases were reported only in Week 9 "corresponding to week 4 from Singapore"
#we can expect weekly growth rates to rise, although at a lower rate, due to the lockdown measures placed in the country.
plt.plot(weekly_growth_ireland["Weeknum"], weekly_growth_ireland["GrowthRate"])
plt.title("Ireland Weekly Growth Rate Curve")
plt.show()
Considering the current world scenario, where different governments have taken different public-health measures, predicting the worldwide spread of coronavirus is a very challenging task. It thus raises future questions to be investigated — for example, the relevance of environmental factors (weather), population density, or even nearby airport traffic.